# IMPORT LIBRARIES and PACKAGES
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, sys, pathlib, random, h5py, PIL, glob
from PIL import Image
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import torch
import torchvision
from torchvision import datasets
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
from torchvision.models import ResNet34_Weights
from IPython.display import display
# For reading the bad images
import tensorflow as tf
import tensorflow.compat.v2 as tf
%matplotlib inline
An AI company has a 2 GB dataset containing over 15,000 images of indoor locations. Originally from MIT, this dataset was built for research on indoor scene recognition. There are 67 categories of JPEG images. The number of images per category varies, but there are at least 100 images for each category.
Data was obtained from: https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019
# Mount Google Drive so the zipped dataset (archive.zip) can be read from it.
from google.colab import drive
# Note: tried to use jupyter notebook from anaconda directly but a recent update caused errors, so using Colab with Gdrive instead.
drive.mount('/content/gdrive')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
# Repasted to delete output from html
!unzip gdrive/MyDrive/archive.zip
# Set the parent folder whose sub-folders are the class labels.
img_dir = pathlib.Path('archive/indoorCVPR_09/Images/')
# FIX: corrected the "photes" typo in the user-facing message.
print(f'The indoorCVPR_09 photos are stored in local directory : {img_dir}')
The indoorCVPR_09 photes are stored in local directory : archive/indoorCVPR_09/Images
# Walk the image tree, printing a per-directory file count and the grand total.
total_files = 0
for root, dirs, files in os.walk(str(img_dir)):
    # Depth below img_dir determines the indentation of the printed name.
    depth = root.replace(str(img_dir), '').count(os.sep)
    pad = '    ' * depth
    n_files = len(files)
    print(f'{pad}{os.path.basename(root)}/ ({n_files} files)')
    total_files += n_files
print(f'There are {total_files} images in this dataset')
Images/ (0 files)
deli/ (258 files)
florist/ (103 files)
artstudio/ (140 files)
concert_hall/ (103 files)
classroom/ (113 files)
poolinside/ (174 files)
trainstation/ (153 files)
corridor/ (346 files)
waitingroom/ (151 files)
videostore/ (110 files)
church_inside/ (180 files)
dentaloffice/ (131 files)
airport_inside/ (608 files)
elevator/ (101 files)
computerroom/ (114 files)
auditorium/ (176 files)
bakery/ (405 files)
museum/ (168 files)
inside_subway/ (457 files)
closet/ (135 files)
warehouse/ (506 files)
kitchen/ (734 files)
office/ (109 files)
kindergarden/ (127 files)
jewelleryshop/ (157 files)
toystore/ (347 files)
restaurant_kitchen/ (107 files)
hospitalroom/ (101 files)
movietheater/ (175 files)
bedroom/ (662 files)
bookstore/ (380 files)
bathroom/ (197 files)
fastfood_restaurant/ (116 files)
garage/ (103 files)
gameroom/ (127 files)
grocerystore/ (213 files)
operating_room/ (135 files)
subway/ (539 files)
buffet/ (111 files)
gym/ (231 files)
shoeshop/ (116 files)
greenhouse/ (101 files)
tv_studio/ (166 files)
winecellar/ (269 files)
dining_room/ (274 files)
mall/ (176 files)
stairscase/ (155 files)
restaurant/ (513 files)
bar/ (604 files)
clothingstore/ (106 files)
cloister/ (120 files)
studiomusic/ (108 files)
casino/ (515 files)
nursery/ (144 files)
meeting_room/ (233 files)
pantry/ (384 files)
prisoncell/ (103 files)
hairsalon/ (239 files)
children_room/ (112 files)
bowling/ (213 files)
inside_bus/ (102 files)
There are 14056 images in this dataset
# Collect the class labels: one sub-directory of img_dir per indoor scene.
IndoorImage_dir = [
    entry for entry in os.listdir(img_dir)
    if os.path.isdir(os.path.join(img_dir, entry))
]
print(f' The Indoor Image labels = {IndoorImage_dir}')
# Sort the labels alphabetically so indices match ImageFolder's class order.
IndoorImage_dir.sort()
print(f'\n The SORTED Indoor Image labels = {IndoorImage_dir}')
# NOTE(review): the source dataset advertises 67 classes, but only 61 folders
# are present in this extracted archive (the output below confirms 61).
print(f'\nThere are {len(IndoorImage_dir)} classes of Indoor Images.') ### There are (67) classes of Indoor Images.
The Indoor Image labels = ['deli', 'florist', 'artstudio', 'concert_hall', 'classroom', 'poolinside', 'trainstation', 'corridor', 'waitingroom', 'videostore', 'church_inside', 'dentaloffice', 'airport_inside', 'elevator', 'computerroom', 'auditorium', 'bakery', 'museum', 'inside_subway', 'closet', 'warehouse', 'kitchen', 'office', 'kindergarden', 'jewelleryshop', 'toystore', 'restaurant_kitchen', 'hospitalroom', 'movietheater', 'bedroom', 'bookstore', 'bathroom', 'fastfood_restaurant', 'garage', 'gameroom', 'grocerystore', 'operating_room', 'subway', 'buffet', 'gym', 'shoeshop', 'greenhouse', 'tv_studio', 'winecellar', 'dining_room', 'mall', 'stairscase', 'restaurant', 'bar', 'clothingstore', 'cloister', 'studiomusic', 'casino', 'nursery', 'meeting_room', 'pantry', 'prisoncell', 'hairsalon', 'children_room', 'bowling', 'inside_bus'] The SORTED Indoor Image labels = ['airport_inside', 'artstudio', 'auditorium', 'bakery', 'bar', 'bathroom', 'bedroom', 'bookstore', 'bowling', 'buffet', 'casino', 'children_room', 'church_inside', 'classroom', 'cloister', 'closet', 'clothingstore', 'computerroom', 'concert_hall', 'corridor', 'deli', 'dentaloffice', 'dining_room', 'elevator', 'fastfood_restaurant', 'florist', 'gameroom', 'garage', 'greenhouse', 'grocerystore', 'gym', 'hairsalon', 'hospitalroom', 'inside_bus', 'inside_subway', 'jewelleryshop', 'kindergarden', 'kitchen', 'mall', 'meeting_room', 'movietheater', 'museum', 'nursery', 'office', 'operating_room', 'pantry', 'poolinside', 'prisoncell', 'restaurant', 'restaurant_kitchen', 'shoeshop', 'stairscase', 'studiomusic', 'subway', 'toystore', 'trainstation', 'tv_studio', 'videostore', 'waitingroom', 'warehouse', 'winecellar'] There are 61 classes of Indoor Images.
Confirmed no bad images
# Scan every image with TensorFlow's decoder to detect corrupt files.
img_paths = glob.glob(os.path.join(img_dir, '*/*.*'))  # assuming you point to the directory containing the label folders.
bad_paths = []
for image_path in img_paths:
    try:
        raw_bytes = tf.io.read_file(image_path)
        tf.io.decode_image(raw_bytes)
    except tf.errors.InvalidArgumentError as e:
        # Decoding failed -> remember the path so it can be excluded/repaired.
        print(f"Found bad path {image_path}...{e}")
        bad_paths.append(image_path)
print("BAD PATHS:")
for bad_path in bad_paths:
    print(f"{bad_path}")
BAD PATHS:
# Imagenet channel statistics used for normalization.
means = [0.485, 0.456, 0.406]
stds = [0.229, 0.224, 0.225]
# Pipeline: resize to 256x256 -> tensor -> normalize with Imagenet stats.
transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((256, 256)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(means, stds),
])
dataset = datasets.ImageFolder(img_dir, transform=transform)
# FIX: SEED was referenced below but never defined anywhere in this notebook,
# which raises NameError. The augmented-reload cell later uses 777, so the
# same value is pinned here for reproducibility.
SEED = 777
# Define the desired split ratio
train_ratio = 0.60
# First split: keep 60% of the data; the remaining 40% is discarded.
train_size = int(len(dataset) * train_ratio)
test_size = len(dataset) - train_size
temp_dataset, nouse = random_split(dataset, [train_size, test_size], generator=torch.Generator().manual_seed(SEED))
# Second split: 2/3 of the kept 60% for training (40% of the original data)
# and 1/3 for testing (20% of the original data).
train_ratio = (2 / 3)
train_size = int(len(temp_dataset) * train_ratio)
test_size = len(temp_dataset) - train_size
train_dataset, test_dataset = random_split(temp_dataset, [train_size, test_size], generator=torch.Generator().manual_seed(SEED))
# Batch size shared by both loaders.
batch_size = 32
# Create separate DataLoaders for training and test (no validation split).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Peek at one batch to confirm tensor shapes and the integer label encoding.
for images, labels in train_loader:
    print(f' The images shape = {images.shape}, and the label encoding = {labels}')
    break
print(f'Number of batches in train_loader is: {len(train_loader)}.')
print(f'Number of batches in test_loader is: {len(test_loader)}.')
The images shape = torch.Size([32, 3, 256, 256]), and the label encoding = tensor([11, 30, 19, 20, 26, 2, 10, 33, 7, 30, 1, 60, 59, 51, 3, 1, 37, 34,
8, 37, 30, 29, 52, 8, 11, 55, 17, 55, 31, 48, 22, 37])
Number of batches in train_loader is: 176.
Number of batches in test_loader is: 88.
# Display 16 training images after undoing the Imagenet normalization.
plt.figure(figsize=(12, 12))
# Inverse of Normalize(means, stds): hoisted out of the loop since it is
# the same transform for every image.
inverse_normalize = torchvision.transforms.Normalize(
    mean=-1 * np.array(means) / np.array(stds),
    std=1 / np.array(stds)
)
for i in range(16):  # Only plotting 16 images
    plt.subplot(4, 4, i + 1)
    plt.xticks([])
    plt.yticks([])
    img = images[i]
    # FIX: actually plot the unnormalized image. The original computed
    # `org_imgorg_img = inverse_normalize(img)` but then converted the still
    # normalized `img`, so the displayed images were clipped/washed out.
    org_img = inverse_normalize(img)
    # Convert Tensor to a numpy array
    npimg = org_img.numpy()
    # Clip values between 0 and 1 due to rounding before
    npimg = np.clip(npimg, 0, 1)
    # PyTorch images are [C, H, W]; imshow expects (H, W, C), so transpose.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.xlabel(IndoorImage_dir[labels[i]])
plt.show()
# Pick the compute device: use the GPU when CUDA is available, else the CPU.
# Model and data must both be moved to this device before training.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 32, 254, 254] 896
MaxPool2d-2 [-1, 32, 127, 127] 0
Conv2d-3 [-1, 64, 125, 125] 18,496
MaxPool2d-4 [-1, 64, 62, 62] 0
Conv2d-5 [-1, 128, 60, 60] 73,856
MaxPool2d-6 [-1, 128, 30, 30] 0
Linear-7 [-1, 256] 29,491,456
Linear-8 [-1, 128] 32,896
Linear-9 [-1, 64] 8,256
Linear-10 [-1, 61] 3,965
================================================================
Total params: 29,629,821
Trainable params: 29,629,821
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.75
Forward/backward pass size (MB): 33.59
Params size (MB): 113.03
Estimated Total Size (MB): 147.37
----------------------------------------------------------------
# Redefine CNNNet with dropouts
class CNNNet(nn.Module):
    """Scratch CNN: three conv/pool stages plus a four-layer classifier head.

    Input:  (N, 3, 256, 256) normalized images.
    Output: (N, 61) raw class scores (logits) for the 61 indoor scene classes.
    """

    def __init__(self):
        super(CNNNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3)
        # One shared 2x2 max-pool, reused after every conv (it has no weights).
        # FIX: the original assigned self.pool twice; the duplicate is removed.
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.conv3 = nn.Conv2d(64, 128, 3)
        # the in_features of 128 * 30 * 30 was determined using the print trick
        self.fc1 = nn.Linear(128 * 30 * 30, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 61)  # 61 unique class labels
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # Flatten the (N, 128, 30, 30) feature maps for the linear layers.
        x = x.view(-1, 128 * 30 * 30)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # FIX: apply dropout to the hidden features, not to the output logits.
        # The original ran Dropout AFTER fc4, randomly zeroing class scores
        # during training, which defeats the purpose of the regularizer.
        x = self.dropout(x)
        return self.fc4(x)
# Instantiate the scratch CNN, move it to the selected device, and print a
# per-layer summary for a 3x256x256 input (torchsummary mimics Keras' summary).
CNN_Model = CNNNet()
CNN_Model.to(device)
summary(CNN_Model,(3, 256, 256))
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 32, 254, 254] 896
MaxPool2d-2 [-1, 32, 127, 127] 0
Conv2d-3 [-1, 64, 125, 125] 18,496
MaxPool2d-4 [-1, 64, 62, 62] 0
Conv2d-5 [-1, 128, 60, 60] 73,856
MaxPool2d-6 [-1, 128, 30, 30] 0
Linear-7 [-1, 256] 29,491,456
Linear-8 [-1, 128] 32,896
Linear-9 [-1, 64] 8,256
Linear-10 [-1, 61] 3,965
Dropout-11 [-1, 61] 0
================================================================
Total params: 29,629,821
Trainable params: 29,629,821
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.75
Forward/backward pass size (MB): 33.59
Params size (MB): 113.03
Estimated Total Size (MB): 147.37
----------------------------------------------------------------
# Reload the data using AutoAugment in transform for data augmentation
transform_aug = torchvision.transforms.Compose(
    [torchvision.transforms.Resize((256, 256)),
     torchvision.transforms.AutoAugment(policy=torchvision.transforms.AutoAugmentPolicy.IMAGENET),  # AutoAugment for data augmentation
     torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize(means, stds)  # normalize images here
     ])
dataset_aug = datasets.ImageFolder(img_dir, transform=transform_aug)
# Define the desired split ratio
train_ratio = 0.60
# FIX: split `dataset_aug`. The original split the plain `dataset`, so the
# AutoAugment pipeline built above was never actually applied to training.
# NOTE: test_loader_aug therefore also serves augmented images; the final
# evaluation below uses the plain test_loader, which is unaffected.
train_size = int(len(dataset_aug) * train_ratio)
test_size = len(dataset_aug) - train_size
temp_dataset, nouse = random_split(dataset_aug, [train_size, test_size], generator=torch.Generator().manual_seed(777))
# Training will be 40% of the total original data and test will be 20%
train_ratio = (2 / 3)
train_size = int(len(temp_dataset) * train_ratio)
test_size = len(temp_dataset) - train_size
train_dataset, test_dataset = random_split(temp_dataset, [train_size, test_size], generator=torch.Generator().manual_seed(777))
# specify batch size etc
batch_size = 32
# Create separate DataLoaders for training and test.
train_loader_aug = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader_aug = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Peek at one batch to confirm tensor shapes and label encoding.
for images, labels in train_loader_aug:
    print(f' The images shape = {images.shape}, and the label encoding = {labels}')
    break
images, labels = images.to(device), labels.to(device)
print(f'Number of batches in train_loader_aug is: {len(train_loader_aug)}.')
print(f'Number of batches in test_loader_aug is: {len(test_loader_aug)}.')
The images shape = torch.Size([32, 3, 256, 256]), and the label encoding = tensor([ 7, 10, 35, 31, 44, 37, 56, 37, 53, 3, 28, 6, 55, 48, 37, 27, 22, 6,
45, 29, 53, 22, 19, 0, 8, 0, 10, 49, 46, 41, 37, 60])
Number of batches in train_loader_aug is: 176.
Number of batches in test_loader_aug is: 88.
%%time
# Loss and optimizer for the scratch CNN.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(CNN_Model.parameters(), lr=0.001, momentum=0.9)
# Training configuration: report every `report_batchs` mini-batches.
report_batchs = 160
EPOCHS = 20
# Pass the whole augmented training set through the CNN once per epoch.
for epoch in range(EPOCHS):
    running_loss = 0.0
    for batch_idx, batch in enumerate(train_loader_aug):
        # Unpack the batch and move it to the compute device.
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Forward pass and loss.
        outputs = CNN_Model(inputs)
        loss = criterion(outputs, labels)
        # Backward pass and weight update.
        loss.backward()
        optimizer.step()
        # Accumulate and periodically report the running loss.
        running_loss += loss.item()
        if (batch_idx + 1) % report_batchs == 0:
            print(f'[Epoch: {epoch + 1}, batch: [{batch_idx+1-report_batchs} to {batch_idx+1}]] loss ={running_loss / report_batchs:.2f}')
            running_loss = 0.0
print(f'Completed Training of {EPOCHS} epochs!')
[Epoch: 1, batch: [0 to 160]] loss =4.10 [Epoch: 2, batch: [0 to 160]] loss =4.00 [Epoch: 3, batch: [0 to 160]] loss =3.92 [Epoch: 4, batch: [0 to 160]] loss =3.88 [Epoch: 5, batch: [0 to 160]] loss =3.84 [Epoch: 6, batch: [0 to 160]] loss =3.79 [Epoch: 7, batch: [0 to 160]] loss =3.74 [Epoch: 8, batch: [0 to 160]] loss =3.70 [Epoch: 9, batch: [0 to 160]] loss =3.65 [Epoch: 10, batch: [0 to 160]] loss =3.60 [Epoch: 11, batch: [0 to 160]] loss =3.52 [Epoch: 12, batch: [0 to 160]] loss =3.45 [Epoch: 13, batch: [0 to 160]] loss =3.36 [Epoch: 14, batch: [0 to 160]] loss =3.29 [Epoch: 15, batch: [0 to 160]] loss =3.21 [Epoch: 16, batch: [0 to 160]] loss =3.07 [Epoch: 17, batch: [0 to 160]] loss =2.94 [Epoch: 18, batch: [0 to 160]] loss =2.75 [Epoch: 19, batch: [0 to 160]] loss =2.42 [Epoch: 20, batch: [0 to 160]] loss =2.13 Completed Training of 20 epochs! CPU times: user 18min 58s, sys: 9.69 s, total: 19min 8s Wall time: 19min 44s
# Grab the first test batch and visualize the scratch CNN's predictions.
for images, labels in test_loader_aug:
    break
images, labels = images.to(device), labels.to(device)  # GPU
outputs = CNN_Model(images)
# Predicted class = index of the largest logit per row.
_, predicted_labels = torch.max(outputs, 1)
plt.figure(figsize=(12, 12))
# Inverse of Normalize(means, stds), shared by all 16 images.
inverse_normalize = torchvision.transforms.Normalize(
    mean=-1 * np.array(means) / np.array(stds),
    std=1 / np.array(stds)
)
for i in range(16):  # View 16 images only
    plt.subplot(4, 4, i + 1)
    plt.xticks([])
    plt.yticks([])
    # Undo the normalization, move to CPU, and convert to numpy for plotting.
    denorm = inverse_normalize(images[i]).cpu().numpy()
    # Clip values between 0 and 1 due to rounding before
    denorm = np.clip(denorm, 0, 1)
    # Tensors are (C, H, W); imshow expects (H, W, C).
    plt.imshow(np.transpose(denorm, (1, 2, 0)))
    # x label = predicted class, y label = actual class.
    plt.xlabel(IndoorImage_dir[predicted_labels[i]])
    plt.ylabel(IndoorImage_dir[labels[i]])
plt.show()
%%time
# Per-class accuracy of the scratch CNN on the (un-augmented) test set.
class_correct = list(0. for i in range(61))
class_total = list(0. for i in range(61))
y_test = []
predictions = []
# No gradients needed for evaluation.
with torch.no_grad():
    # Loop over the test set one batch (up to batch_size images) at a time.
    for data in test_loader:
        # Separate the features and labels.
        images, labels = data
        images, labels = images.to(device), labels.to(device)  # GPU
        # Forward pass; predicted class = argmax of the logits.
        outputs = CNN_Model(images)
        _, predicted = torch.max(outputs, 1)
        # Element-wise correctness mask for this batch.
        # FIX: dropped the `.squeeze()` — on a final batch of size 1 it turned
        # the mask into a 0-dim tensor, making c[i] below raise IndexError.
        c = (predicted == labels)
        # Convert back to CPU so sklearn/numpy can consume the results later.
        y_test.append(labels.cpu().numpy())
        predictions.append(predicted.cpu().numpy())
        # len(labels) handles both full batches and the smaller last batch.
        for i in range(len(labels)):
            label = labels[i]
            # Update the per-class correct and total counters.
            class_correct[label] += c[i].item()
            class_total[label] += 1
# Print the per-class accuracies.
for i in range(61):
    # FIX: guard against ZeroDivisionError when a class has no test samples.
    if class_total[i] == 0:
        print(f'Accuracy of {IndoorImage_dir[i]} = n/a (no test samples)')
    else:
        print(f'Accuracy of {IndoorImage_dir[i]} = {100 * class_correct[i] / class_total[i]:.2f}')
Accuracy of airport_inside = 32.14 Accuracy of artstudio = 24.14 Accuracy of auditorium = 23.26 Accuracy of bakery = 32.63 Accuracy of bar = 29.52 Accuracy of bathroom = 28.89 Accuracy of bedroom = 53.79 Accuracy of bookstore = 37.88 Accuracy of bowling = 32.56 Accuracy of buffet = 4.35 Accuracy of casino = 26.60 Accuracy of children_room = 10.34 Accuracy of church_inside = 37.50 Accuracy of classroom = 21.74 Accuracy of cloister = 21.05 Accuracy of closet = 7.41 Accuracy of clothingstore = 33.33 Accuracy of computerroom = 21.05 Accuracy of concert_hall = 5.56 Accuracy of corridor = 52.50 Accuracy of deli = 22.64 Accuracy of dentaloffice = 16.67 Accuracy of dining_room = 28.85 Accuracy of elevator = 45.00 Accuracy of fastfood_restaurant = 8.00 Accuracy of florist = 20.00 Accuracy of gameroom = 15.79 Accuracy of garage = 23.08 Accuracy of greenhouse = 13.33 Accuracy of grocerystore = 32.00 Accuracy of gym = 24.56 Accuracy of hairsalon = 23.68 Accuracy of hospitalroom = 27.27 Accuracy of inside_bus = 16.67 Accuracy of inside_subway = 46.99 Accuracy of jewelleryshop = 8.33 Accuracy of kindergarden = 12.50 Accuracy of kitchen = 31.41 Accuracy of mall = 19.35 Accuracy of meeting_room = 31.37 Accuracy of movietheater = 51.35 Accuracy of museum = 26.47 Accuracy of nursery = 30.30 Accuracy of office = 8.33 Accuracy of operating_room = 39.39 Accuracy of pantry = 52.17 Accuracy of poolinside = 27.59 Accuracy of prisoncell = 4.35 Accuracy of restaurant = 19.80 Accuracy of restaurant_kitchen = 15.00 Accuracy of shoeshop = 14.29 Accuracy of stairscase = 21.43 Accuracy of studiomusic = 20.59 Accuracy of subway = 32.73 Accuracy of toystore = 8.62 Accuracy of trainstation = 17.86 Accuracy of tv_studio = 28.21 Accuracy of videostore = 4.76 Accuracy of waitingroom = 12.90 Accuracy of warehouse = 48.48 Accuracy of winecellar = 11.76 CPU times: user 23 s, sys: 215 ms, total: 23.2 s Wall time: 23.4 s
# Flatten the per-batch label/prediction arrays into single flat lists.
flat_y = []
for batch_labels in y_test:
    flat_y.extend(batch_labels)
flat_pred = []
for batch_preds in predictions:
    flat_pred.extend(batch_preds)
# No confusion matrix due to 61 classes
print(classification_report(flat_y, flat_pred))
precision recall f1-score support
0 0.35 0.32 0.33 112
1 0.15 0.24 0.19 29
2 0.13 0.23 0.17 43
3 0.34 0.33 0.33 95
4 0.49 0.30 0.37 105
5 0.43 0.29 0.35 45
6 0.31 0.54 0.39 145
7 0.24 0.38 0.29 66
8 0.31 0.33 0.32 43
9 1.00 0.04 0.08 23
10 0.68 0.27 0.38 94
11 0.60 0.10 0.18 29
12 0.27 0.38 0.32 40
13 1.00 0.22 0.36 23
14 0.57 0.21 0.31 19
15 0.06 0.07 0.07 27
16 0.50 0.33 0.40 12
17 0.25 0.21 0.23 19
18 0.50 0.06 0.10 18
19 0.34 0.53 0.42 80
20 0.40 0.23 0.29 53
21 0.75 0.17 0.27 18
22 0.17 0.29 0.21 52
23 0.22 0.45 0.30 20
24 1.00 0.08 0.15 25
25 0.80 0.20 0.32 20
26 0.25 0.16 0.19 19
27 0.46 0.23 0.31 26
28 0.29 0.13 0.18 15
29 0.36 0.32 0.34 50
30 0.54 0.25 0.34 57
31 0.32 0.24 0.27 38
32 0.38 0.27 0.32 22
33 1.00 0.17 0.29 18
34 0.27 0.47 0.34 83
35 0.60 0.08 0.15 36
36 0.33 0.12 0.18 24
37 0.30 0.31 0.31 156
38 0.11 0.19 0.14 31
39 0.31 0.31 0.31 51
40 0.25 0.51 0.33 37
41 0.09 0.26 0.14 34
42 0.29 0.30 0.30 33
43 0.29 0.08 0.13 24
44 0.33 0.39 0.36 33
45 0.17 0.52 0.26 69
46 0.73 0.28 0.40 29
47 1.00 0.04 0.08 23
48 0.54 0.20 0.29 101
49 1.00 0.15 0.26 20
50 0.67 0.14 0.24 28
51 0.30 0.21 0.25 28
52 0.44 0.21 0.28 34
53 0.44 0.33 0.38 110
54 0.71 0.09 0.15 58
55 0.71 0.18 0.29 28
56 0.22 0.28 0.25 39
57 0.50 0.05 0.09 21
58 0.17 0.13 0.15 31
59 0.23 0.48 0.31 99
60 0.43 0.12 0.18 51
accuracy 0.29 2811
macro avg 0.44 0.25 0.26 2811
weighted avg 0.40 0.29 0.29 2811
The model could also have trained for more epochs, given that the loss was still decreasing at the end of training. However, model 2 performs decently given 61 target classes, with an overall accuracy of 29% and a weighted f1-score of 0.29 as well. Compared to my model in TF (accuracy: 0.25, f1: 0.22), this PyTorch model actually slightly outperforms it.
Overall, I would say this is an okay model but definitely could have trained on more epochs if I did not run out of free GPU time on Colab.
# Load ResNet-34 with its Imagenet weights (equivalent to pretrained=True).
model_resnet34 = torchvision.models.resnet34(weights=ResNet34_Weights.DEFAULT)
# Freeze every pretrained parameter; only the new head will be trained.
for param in model_resnet34.parameters():
    param.requires_grad = False
# Parameters of newly constructed modules have requires_grad=True by default.
num_ftrs = model_resnet34.fc.in_features
# Custom classifier head:
#   512-unit hidden layer -> ReLU -> Dropout(0.2) -> 61-way output layer.
classifier = nn.Sequential(
    nn.Linear(num_ftrs, 512),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(512, 61),
)
# Swap the pretrained 1000-way head for our 61-way classifier.
model_resnet34.fc = classifier
criterion = nn.CrossEntropyLoss()
# Only the (unfrozen) head parameters are optimized, unlike the scratch CNN.
optimizer = optim.SGD(model_resnet34.fc.parameters(), lr=0.001, momentum=0.9)
%%time
# Fine-tune the ResNet-34 head on the (non-augmented) training loader.
report_batchs = 160
EPOCHS = 20
model_resnet34 = model_resnet34.to(device)  # Set model to GPU
# Pass the whole training set through the pretrained model once per epoch.
for epoch in range(EPOCHS):
    running_loss = 0.0
    for step, (inputs, labels) in enumerate(train_loader):
        # Move the batch to the compute device.
        inputs, labels = inputs.to(device), labels.to(device)
        # Clear gradients, then run the forward pass and compute the loss.
        optimizer.zero_grad()
        outputs = model_resnet34(inputs)
        loss = criterion(outputs, labels)
        # Backpropagate and update the classifier head's weights.
        loss.backward()
        optimizer.step()
        # Track the running loss and report every `report_batchs` batches.
        running_loss += loss.item()
        if step % report_batchs == report_batchs - 1:
            print(f'[Epoch: {epoch + 1}, batch: [{step+1-report_batchs} to {step+1}]] loss ={running_loss / report_batchs:.2f}')
            running_loss = 0.0
print(f'Completed Training of {EPOCHS} epochs!')
[Epoch: 1, batch: [0 to 160]] loss =3.80 [Epoch: 2, batch: [0 to 160]] loss =3.26 [Epoch: 3, batch: [0 to 160]] loss =2.77 [Epoch: 4, batch: [0 to 160]] loss =2.36 [Epoch: 5, batch: [0 to 160]] loss =2.08 [Epoch: 6, batch: [0 to 160]] loss =1.85 [Epoch: 7, batch: [0 to 160]] loss =1.70 [Epoch: 8, batch: [0 to 160]] loss =1.55 [Epoch: 9, batch: [0 to 160]] loss =1.47 [Epoch: 10, batch: [0 to 160]] loss =1.38 [Epoch: 11, batch: [0 to 160]] loss =1.31 [Epoch: 12, batch: [0 to 160]] loss =1.26 [Epoch: 13, batch: [0 to 160]] loss =1.21 [Epoch: 14, batch: [0 to 160]] loss =1.16 [Epoch: 15, batch: [0 to 160]] loss =1.11 [Epoch: 16, batch: [0 to 160]] loss =1.09 [Epoch: 17, batch: [0 to 160]] loss =1.06 [Epoch: 18, batch: [0 to 160]] loss =1.00 [Epoch: 19, batch: [0 to 160]] loss =0.97 [Epoch: 20, batch: [0 to 160]] loss =0.96 Completed Training of 20 epochs! CPU times: user 18min 9s, sys: 16.4 s, total: 18min 26s Wall time: 18min 49s
# Visualize ResNet-34 predictions on the first plain test batch.
for images, labels in test_loader:
    break
images, labels = images.to(device), labels.to(device)  # GPU
outputs = model_resnet34(images)
# Predicted class index = position of the max logit per row.
_, predicted_labels = torch.max(outputs, 1)
plt.figure(figsize=(12, 12))
# Inverse of Normalize(means, stds), shared by all plotted images.
inverse_normalize = torchvision.transforms.Normalize(
    mean=-1 * np.array(means) / np.array(stds),
    std=1 / np.array(stds)
)
for idx in range(16):  # View 16 images only
    plt.subplot(4, 4, idx + 1)
    plt.xticks([])
    plt.yticks([])
    # Undo normalization, move to CPU, convert to numpy for plotting.
    picture = inverse_normalize(images[idx]).cpu().numpy()
    # Clip values between 0 and 1 due to rounding before
    picture = np.clip(picture, 0, 1)
    # (C, H, W) -> (H, W, C) for imshow.
    plt.imshow(np.transpose(picture, (1, 2, 0)))
    # x label = predicted class, y label = actual class.
    plt.xlabel(IndoorImage_dir[predicted_labels[idx]])
    plt.ylabel(IndoorImage_dir[labels[idx]])
plt.show()
%%time
# Per-class accuracy of the fine-tuned ResNet-34 on the plain test set.
class_correct = list(0. for i in range(61))
class_total = list(0. for i in range(61))
y_test = []
predictions = []
# No gradients needed for evaluation.
with torch.no_grad():
    # Loop over the test set one batch (up to batch_size images) at a time.
    for data in test_loader:
        # Separate the features and labels.
        images, labels = data
        images, labels = images.to(device), labels.to(device)  # GPU
        # Forward pass; predicted class = argmax of the logits.
        outputs = model_resnet34(images)
        _, predicted = torch.max(outputs, 1)
        # Element-wise correctness mask for this batch.
        # FIX: dropped the `.squeeze()` — on a final batch of size 1 it turned
        # the mask into a 0-dim tensor, making c[i] below raise IndexError.
        c = (predicted == labels)
        # Convert back to CPU so sklearn/numpy can consume the results later.
        y_test.append(labels.cpu().numpy())
        predictions.append(predicted.cpu().numpy())
        # len(labels) handles both full batches and the smaller last batch.
        for i in range(len(labels)):
            label = labels[i]
            # Update the per-class correct and total counters.
            class_correct[label] += c[i].item()
            class_total[label] += 1
# Print the per-class accuracies.
for i in range(61):
    # FIX: guard against ZeroDivisionError when a class has no test samples.
    if class_total[i] == 0:
        print(f'Accuracy of {IndoorImage_dir[i]} = n/a (no test samples)')
    else:
        print(f'Accuracy of {IndoorImage_dir[i]} = {100 * class_correct[i] / class_total[i]:.2f}')
Accuracy of airport_inside = 66.07 Accuracy of artstudio = 31.03 Accuracy of auditorium = 51.16 Accuracy of bakery = 77.89 Accuracy of bar = 59.05 Accuracy of bathroom = 68.89 Accuracy of bedroom = 79.31 Accuracy of bookstore = 72.73 Accuracy of bowling = 69.77 Accuracy of buffet = 47.83 Accuracy of casino = 84.04 Accuracy of children_room = 31.03 Accuracy of church_inside = 82.50 Accuracy of classroom = 56.52 Accuracy of cloister = 78.95 Accuracy of closet = 70.37 Accuracy of clothingstore = 16.67 Accuracy of computerroom = 31.58 Accuracy of concert_hall = 22.22 Accuracy of corridor = 70.00 Accuracy of deli = 32.08 Accuracy of dentaloffice = 55.56 Accuracy of dining_room = 51.92 Accuracy of elevator = 40.00 Accuracy of fastfood_restaurant = 52.00 Accuracy of florist = 85.00 Accuracy of gameroom = 73.68 Accuracy of garage = 26.92 Accuracy of greenhouse = 93.33 Accuracy of grocerystore = 62.00 Accuracy of gym = 77.19 Accuracy of hairsalon = 50.00 Accuracy of hospitalroom = 40.91 Accuracy of inside_bus = 38.89 Accuracy of inside_subway = 75.90 Accuracy of jewelleryshop = 27.78 Accuracy of kindergarden = 50.00 Accuracy of kitchen = 82.69 Accuracy of mall = 29.03 Accuracy of meeting_room = 76.47 Accuracy of movietheater = 56.76 Accuracy of museum = 35.29 Accuracy of nursery = 81.82 Accuracy of office = 41.67 Accuracy of operating_room = 48.48 Accuracy of pantry = 84.06 Accuracy of poolinside = 75.86 Accuracy of prisoncell = 43.48 Accuracy of restaurant = 68.32 Accuracy of restaurant_kitchen = 30.00 Accuracy of shoeshop = 28.57 Accuracy of stairscase = 71.43 Accuracy of studiomusic = 76.47 Accuracy of subway = 72.73 Accuracy of toystore = 72.41 Accuracy of trainstation = 28.57 Accuracy of tv_studio = 30.77 Accuracy of videostore = 47.62 Accuracy of waitingroom = 29.03 Accuracy of warehouse = 86.87 Accuracy of winecellar = 70.59 CPU times: user 26.8 s, sys: 498 ms, total: 27.3 s Wall time: 27.4 s
# Flatten the per-batch label/prediction arrays into single flat lists.
flat_y = []
for batch_labels in y_test:
    flat_y.extend(batch_labels)
flat_pred = []
for batch_preds in predictions:
    flat_pred.extend(batch_preds)
# No confusion matrix due to 61 classes
print(classification_report(flat_y, flat_pred))
precision recall f1-score support
0 0.51 0.66 0.57 112
1 0.43 0.31 0.36 29
2 0.51 0.51 0.51 43
3 0.59 0.78 0.67 95
4 0.54 0.59 0.56 105
5 0.74 0.69 0.71 45
6 0.85 0.79 0.82 145
7 0.81 0.73 0.77 66
8 0.91 0.70 0.79 43
9 0.61 0.48 0.54 23
10 0.76 0.84 0.80 94
11 0.47 0.31 0.38 29
12 0.79 0.82 0.80 40
13 0.50 0.57 0.53 23
14 0.68 0.79 0.73 19
15 0.86 0.70 0.78 27
16 0.29 0.17 0.21 12
17 0.50 0.32 0.39 19
18 0.33 0.22 0.27 18
19 0.79 0.70 0.74 80
20 0.37 0.32 0.34 53
21 0.43 0.56 0.49 18
22 0.51 0.52 0.51 52
23 0.73 0.40 0.52 20
24 0.62 0.52 0.57 25
25 0.85 0.85 0.85 20
26 0.70 0.74 0.72 19
27 0.54 0.27 0.36 26
28 0.78 0.93 0.85 15
29 0.72 0.62 0.67 50
30 0.72 0.77 0.75 57
31 0.35 0.50 0.41 38
32 0.69 0.41 0.51 22
33 0.88 0.39 0.54 18
34 0.81 0.76 0.78 83
35 0.37 0.28 0.32 36
36 0.39 0.50 0.44 24
37 0.78 0.83 0.80 156
38 0.56 0.29 0.38 31
39 0.71 0.76 0.74 51
40 0.62 0.57 0.59 37
41 0.48 0.35 0.41 34
42 0.82 0.82 0.82 33
43 0.56 0.42 0.48 24
44 0.59 0.48 0.53 33
45 0.76 0.84 0.80 69
46 0.67 0.76 0.71 29
47 0.62 0.43 0.51 23
48 0.41 0.68 0.51 101
49 0.67 0.30 0.41 20
50 0.80 0.29 0.42 28
51 0.83 0.71 0.77 28
52 0.84 0.76 0.80 34
53 0.62 0.73 0.67 110
54 0.48 0.72 0.58 58
55 0.62 0.29 0.39 28
56 0.67 0.31 0.42 39
57 0.83 0.48 0.61 21
58 0.45 0.29 0.35 31
59 0.68 0.87 0.76 99
60 0.73 0.71 0.72 51
accuracy 0.64 2811
macro avg 0.63 0.57 0.59 2811
weighted avg 0.65 0.64 0.63 2811
Similar to the pre-trained model in my TF notebook, the pretrained resnet model was the best model with an overall accuracy of 64% and weighted f1-score of 0.63, which are basically the same scores I got in my TF notebook.
While the loss was still decreasing at the end of training, epochs 19 to 20 resulted in a decrease in loss of only 0.01. Given more GPU time, I would have run more training epochs until the loss stopped decreasing.
Overall, the pre-trained model produces the best scores in accuracy and f-1 on the test data and is my recommended model.
%%shell
jupyter nbconvert --to html /content/MITIndoorPyTorch.ipynb